Activate the next cell to toggle code display on and off.
# Notebook helper: hide input cells in the exported HTML version and add a toggle button.
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)
# This line will hide code by default when the notebook is exported as HTML
# (the jQuery check leaves cells visible when running inside the live notebook app).
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)
# This line will add a button to toggle visibility of code blocks, for use with the HTML export version
di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)
We tune the weights of our model so that the corresponding linear decision boundary
\begin{equation} \text{model}\left(\mathbf{x},\mathbf{w}\right) = \mathbf{x}^T \mathbf{w} = 0 \end{equation}
separates the two classes as well as is possible using a linear model.
model, like the softmax cost, whose minimum provides us with the weights that serve our ideal as best as possible.
where here $f$ is some parameterized or unparameterized nonlinear function or feature transformation
where $f_1,\,f_2,\,...\,f_B$ are nonlinear parameterized or unparameterized feature transformations and $w_0$ through $w_B$
# the softmax (log-loss) cost function for two-class classification
def softmax(w):
    """Return the average softmax cost of weights w over the global data (x, y).

    Assumes labels y take values in {-1, +1} and that `model`, `x`, and `y`
    are defined at module level (as elsewhere in this notebook).
    """
    # np.logaddexp(0, -z) == log(1 + exp(-z)), computed stably: the original
    # np.log(1 + np.exp(-z)) overflows for large -z.
    cost = np.sum(np.logaddexp(0, -y*model(x, w)))
    return cost/float(len(y))
model - linear or nonlinear - so we can push it to the back of our `model` function too, since it will look essentially the same throughout all examples. Our `model` will always look like the following:
# an implementation of our model employing a nonlinear feature transformation
# generic model: nonlinear feature transformation followed by a linear combination
def model(x,w):
    """Evaluate the model on input x given weights w = [transform_weights, linear_weights].

    w[0] parameterizes the feature transformation; w[1] holds the linear
    combination weights (bias first).
    """
    # pass the raw input through the (possibly parameterized) feature transform
    transformed = feature_transforms(x, w[0])
    # prepend a row of ones so the bias enters the linear combination
    bias_row = np.ones((1, np.shape(transformed)[1]))
    stacked = np.vstack((bias_row, transformed))
    # linear combination of the transformed features
    return np.dot(stacked.T, w[1])
model is then (this reduces to our typical linear model):
# the trivial linear feature transformation
# identity feature transformation: with this the model reduces to the standard linear model
def feature_transforms(x):
    """Return the input unchanged (trivial / linear feature transform)."""
    return x
# parameters for our two runs of gradient descent
# NOTE(review): w has shape (2,1) — a bias plus one feature weight; alpha_choice is the steplength
w = 0.1*np.random.randn(2,1); max_its = 500; alpha_choice = 10**(2)
# run on normalized data
# presumably nonlib.basic_runner.Setup standardizes the input and minimizes the softmax cost — verify against the library
run = nonlib.basic_runner.Setup(x,y,feature_transforms,'softmax',normalize = 'standard')
run.fit(w=w,alpha_choice = alpha_choice,max_its = max_its)
</figure> </p>
model is then implemented in `Python` as follows:
# our quadratic feature transformation
# degree-2 polynomial feature transformation: stacks x and x**2 as rows
def feature_transforms(x):
    """Return a (2, N) array whose rows are x**1 and x**2 over the flattened input."""
    flat = x.flatten()
    # one row per polynomial degree 1..2
    return np.array([flat**degree for degree in (1, 2)])
# parameters for our two runs of gradient descent
# NOTE(review): w has shape (3,1) — a bias plus one weight per polynomial degree
w = 0.1*np.random.randn(3,1); max_its = 500; alpha_choice = 10**(2)
# run on normalized data
# presumably nonlib.basic_runner.Setup standardizes the input and minimizes the softmax cost — verify against the library
run = nonlib.basic_runner.Setup(x,y,feature_transforms,'softmax',normalize = 'standard')
run.fit(w=w,alpha_choice = alpha_choice,max_its = max_its)
model of the form — in `Python` our `feature_transforms` function:
# an elliptical feature transformation
# elliptical feature transformation: square each input coordinate elementwise
def feature_transforms(x):
    """Return the elementwise square of the input (elliptical feature transform)."""
    return np.square(x)
# parameters for our two runs of gradient descent
# NOTE(review): w has shape (3,1) — a bias plus one weight per squared coordinate
w = 0.1*np.random.randn(3,1); max_its = 200; alpha_choice = 10**(1)
# run on normalized data
# presumably nonlib.basic_runner.Setup minimizes the softmax cost; normalization is disabled here — verify against the library
run = nonlib.basic_runner.Setup(x,y,feature_transforms,'softmax',normalize = 'none')
run.fit(w=w,alpha_choice = alpha_choice,max_its = max_its)
- in `Python` our `feature_transforms` function
# parameterized sine feature transformation: f = sin(w^T [1; x])
def feature_transforms(x,w):
    """Apply a parameterized sine nonlinearity to the input.

    A bias row of ones is prepended to x, the result is linearly combined
    with w, and the sine is taken elementwise.
    """
    # prepend a row of ones so w carries its own bias term
    bias_row = np.ones((1, np.shape(x)[1]))
    padded = np.vstack((bias_row, x))
    # sine of the linear combination, transposed back to (features, N)
    return np.sin(padded.T.dot(w)).T
# parameters for our two runs of gradient descent
# scale of the random initialization for both weight sets
scale = 2
# w[0] parameterizes the sine feature transform; w[1] holds the final linear combination weights
w = [scale*np.random.randn(3,1),scale*np.random.randn(2,1)]
max_its = 1000; alpha_choice = 10**(-1)
# run on normalized data
# presumably nonlib.basic_runner.Setup standardizes the input and minimizes the softmax cost — verify against the library
run = nonlib.basic_runner.Setup(x,y,feature_transforms,'softmax',normalize = 'standard')
run.fit(w=w,alpha_choice = alpha_choice,max_its = max_its)
where $i + j \leq D$.
In `Python` we can implement this in a `feature_transforms` function as follows.
# degree-D polynomial feature transformation for two-dimensional input
def feature_transforms(x):
    """Return all cross terms x1**i * x2**j with i + j < D, skipping the constant term.

    Assumes x has two rows (the two input coordinates) and that D is defined
    at module level, as elsewhere in this notebook.
    """
    # NOTE: the original comment mislabeled this as "elliptical"; it is polynomial.
    feats = [
        (x[0, :]**i) * (x[1, :]**j)
        for i in range(D)
        for j in range(D - i)
        if i > 0 or j > 0
    ]
    return np.array(feats)
# run one versus all
# NOTE(review): w has shape (6,1) — presumably a bias plus the five polynomial features for D = 3; verify against the caller
max_its = 1500; alpha_choice = 10**(0); w = 0.1*np.random.randn(6,1)
# trains one two-class classifier per class and returns the combined weights plus the misclassification-count history
combined_weights, count_history = nonlib.one_versus_all.train(x,y,feature_transforms,alpha_choice = alpha_choice,max_its = max_its,w = w)
# parameters for our two runs of gradient descent
# NOTE(review): w has shape (6,3) — one weight column per class for the multiclass softmax; verify against the data
w = 0.1*np.random.randn(6,3); max_its = 1500; alpha_choice = 10**(0)
# run on normalized data
# presumably nonlib.basic_runner.Setup standardizes the input and minimizes the multiclass softmax cost — verify against the library
run = nonlib.basic_runner.Setup(x,y,feature_transforms,'multiclass_softmax',normalize = 'standard')
run.fit(w=w,alpha_choice = alpha_choice,max_its = max_its)